rm(list = ls())
library(ggplot2)
library(data.table)
library(ebal)
library(foreign)
library(plyr)
library(extrafont)

setwd("")

source("code/function-createdemograph.R")

#Make t0/t1 phone vs. online demographics graph
#Pull in data
t1.t0.phone_mail.data <- read.dta('data/t1_t0_phone_mail_anon.dta')
#All other tranches are online only

#Calculate parameters for Footnote.
#We began with a list of 46,720 registered voters.
nrow(t1.t0.phone_mail.data)
#37,086 had landline or mobile phone numbers available.
table(t1.t0.phone_mail.data$has_phone)
#We randomly assigned those with phone numbers to phone (N=10,722)
table(t1.t0.phone_mail.data$universe, t1.t0.phone_mail.data$has_phone)
#or mail- to-online surveys (N=26,364) at the household level.
#We also attempted to survey all voters without phone numbers using the mail-to-online surveys (N=9,634)
table(subset(t1.t0.phone_mail.data, is.na(has_phone), universe))
#for 35,998 voters assigned to mail-to-online surveys in all
table(t1.t0.phone_mail.data$universe)
#Of the 35,998 voters assigned to the mail-to-online surveys
table(t1.t0.phone_mail.data$universe)
#1,894 completed the first round
table(t1.t0.phone_mail.data$online_survey_t0)
#for a response rate of 5.3%
1894/35998
#Of these, we sampled 874 to be asked to participate in the second round of surveying
table(t1.t0.phone_mail.data$solicit_t1, t1.t0.phone_mail.data$universe)
#619 completed this second round
table(t1.t0.phone_mail.data$online_survey_t1)
#for a reinterview rate of 71%
619/874
#Of the 10,722 voters assigned to phone surveys
table(t1.t0.phone_mail.data$universe)
#532 completed them
table(t1.t0.phone_mail.data$phone_survey_t0)
#yielding a response rate of 5%
532/10722
#eventually completed 190 second round surveys
table(t1.t0.phone_mail.data$phone_survey_t1)
#reinterview rate of 36%
190/532
#Type of phone
table(t1.t0.phone_mail.data[t1.t0.phone_mail.data$universe == "phone",]$voterbase_phone_type)
#Type of phone, respondent t0
table(t1.t0.phone_mail.data[t1.t0.phone_mail.data$universe == "phone" & t1.t0.phone_mail.data$phone_survey_t0 == 1,]$voterbase_phone_type)
#Type of phone, respondent t1
table(t1.t0.phone_mail.data[t1.t0.phone_mail.data$universe == "phone" & t1.t0.phone_mail.data$phone_survey_t1 == 1,]$voterbase_phone_type)

#########################################################################
#Values for Table OA1: Observed Empirical Values in Representative Study#
#########################################################################
#Observed R values
#Telephone, First Wave
sum(t1.t0.phone_mail.data$phone_survey_t0==1, na.rm=TRUE) / 
    sum(t1.t0.phone_mail.data$universe == "phone")
#Telephone, Second Wave
sum(t1.t0.phone_mail.data$phone_survey_t1==1, na.rm=TRUE) / 
  sum(t1.t0.phone_mail.data$universe == "phone" & t1.t0.phone_mail.data$solicit_t1 == 1)
#Online, First Wave
sum(t1.t0.phone_mail.data$online_survey_t0==1, na.rm=TRUE) / 
  sum(t1.t0.phone_mail.data$universe == "online")
#Online, Second Wave
sum(t1.t0.phone_mail.data$online_survey_t1==1, na.rm=TRUE) / 
  sum(t1.t0.phone_mail.data$universe == "online" & t1.t0.phone_mail.data$solicit_t1 == 1)

#Observed rho^2 values
#Telephone, Single
(summary(lm(t1_recommend ~ t0_recommend, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "phone")))$r.squared +
summary(lm(t1_importance ~ t0_importance, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "phone")))$r.squared +
summary(lm(t1_benefitrisk ~ t0_benefitrisk, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "phone")))$r.squared +
summary(lm(t1_immunity ~ t0_immunity, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "phone")))$r.squared) /
  4
#Telephone, Multiple
summary(lm(t1_factor ~ t0_factor, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "phone")))$r.squared
#Online, Single
(summary(lm(t1_recommend ~ t0_recommend, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "online")))$r.squared +
    summary(lm(t1_importance ~ t0_importance, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "online")))$r.squared +
    summary(lm(t1_benefitrisk ~ t0_benefitrisk, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "online")))$r.squared +
    summary(lm(t1_immunity ~ t0_immunity, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "online")))$r.squared) /
  4
#Online, Multiple
summary(lm(t1_factor ~ t0_factor, data = subset(t1.t0.phone_mail.data, t1.t0.phone_mail.data$universe == "online")))$r.squared

#Observed S values: These cannot be estimated from the data as these are cost parameters provided by the vendor.

###############################################################################
#Values for Table OA2: Representativeness of Mail-to-Online and Phone Surveys##
###############################################################################
se <- function(x) sqrt(var(x, na.rm = TRUE)/sum(!is.na(x)))
  
vars <- c("voterbase_age", "vf_female", "vf_white", "vf_afam", "vf_hispanic", "vf_asian", 
          "vf_g2014", "vf_g2012", "vf_g2010", "vf_g2008",  "vf_dem", "vf_rep")

#Starting Universe
for(i in 1:length(vars)) {
  print(round(mean(t1.t0.phone_mail.data[,vars[i]], na.rm = TRUE), 3)) #Mean
  print(round(se(t1.t0.phone_mail.data[,vars[i]]), 4)) #SE
}

##Voters with Phones
for(i in 1:length(vars)) {
  print(round(mean(t1.t0.phone_mail.data[t1.t0.phone_mail.data$has_phone ==1,vars[i]], na.rm = TRUE), 3))
  print(round(se(t1.t0.phone_mail.data[t1.t0.phone_mail.data$has_phone ==1,vars[i]]), 4))
}

##Online Respondents
for(i in 1:length(vars)) {
  print(round(mean(t1.t0.phone_mail.data[t1.t0.phone_mail.data$online_survey_t0 ==1,vars[i]], na.rm = TRUE), 3))
  print(round(se(t1.t0.phone_mail.data[t1.t0.phone_mail.data$online_survey_t0 ==1,vars[i]]), 4))
}

#Phone Respondents
for(i in 1:length(vars)) {
  print(round(mean(t1.t0.phone_mail.data[t1.t0.phone_mail.data$phone_survey_t0 ==1,vars[i]], na.rm = TRUE), 3))
  print(round(se(t1.t0.phone_mail.data[t1.t0.phone_mail.data$phone_survey_t0 ==1,vars[i]]), 4))
}

###Calculate N
nrow(t1.t0.phone_mail.data) #Starting Universe
sum(!is.na(t1.t0.phone_mail.data$has_phone)) #Voters with Phones
sum(!is.na(t1.t0.phone_mail.data$online_survey_t0)) #Online Respondents
sum(!is.na(t1.t0.phone_mail.data$phone_survey_t0)) #Phone Respondents

####Calculate Response Rate
sum(t1.t0.phone_mail.data$online_survey_t0==1, na.rm=TRUE) / 
  sum(t1.t0.phone_mail.data$universe == "online") #Online Respondents
sum(t1.t0.phone_mail.data$phone_survey_t0==1, na.rm=TRUE) / 
  sum(t1.t0.phone_mail.data$universe == "phone") #Phone Respondents

#Reweight to account for phone universe
#Within "entire usa" only has_phone == 1 was eligible for phone survey (79.4% of entire usa sample)
#28.91% of "entire usa" with has_phone was included in the phone survey
#tab universe if tranche == "entire usa" & has_phone == 1
#If universe == online, we need to upweight the people with has_phone
#Total has_phone = 37086: sum(t1.t0.phone_mail.data$has_phone, na.rm = TRUE)
#has_phone within online universe = 26364: sum(subset(t1.t0.phone_mail.data, universe == "online")$has_phone, na.rm = TRUE)
#pweight = 37086 / 26364 = 1.406691
t1.t0.phone_mail.data$pweight.online.t0 <- 1
t1.t0.phone_mail.data$has_phone[is.na(t1.t0.phone_mail.data$has_phone)] <- 0
t1.t0.phone_mail.data$pweight.online.t0[t1.t0.phone_mail.data$has_phone == 1 & t1.t0.phone_mail.data$universe == "online"] <- 37086 / 26364
table(t1.t0.phone_mail.data$pweight.online.t0)
#Set this back to NA for below code
t1.t0.phone_mail.data$has_phone[t1.t0.phone_mail.data$has_phone == 0] <- NA
#Need to adjust the pweight for the t1 resolicitation.
#Not everyone was eligible for the follow-up survey
#In a two-stage design, the probability weight is calculated as f1f2, 
#which means that the inverse of the sampling fraction for the first stage is multiplied 
#by the inverse of the sampling fraction for the second stage.
#http://www.ats.ucla.edu/stat/stata/seminars/applied_svy_stata9/
t1.t0.phone_mail.data$pweight.online.t1 <- t1.t0.phone_mail.data$pweight * t1.t0.phone_mail.data$pweight.online.t0
table(t1.t0.phone_mail.data$pweight.online.t1)

#Set up for the graph
vars <- c("vf_female", "vf_white", "vf_afam", "vf_hispanic", "vf_asian", "vf_dem", "vf_rep", "vf_nophone", "vf_g2014",
          "vf_g2012", "vf_g2010", "vf_g2008", "vf_under35", "vf_under35to60", "vf_over60")
vars.labels <- c("% Female", "% Modeled White", "% Modeled Black", "% Modeled Latinx", 
                 "% Modeled Asian", "% Reg Democrat", "% Reg Republican", "% without Phone", 
                 "% Voted 2014", "% Voted 2012", "% Voted 2010", "% Voted 2008",
                 "% Age 18-34", "% Age 35-60", "% Age 60+")
comparisons <- c("all",
                 "online_survey_t0", "online_survey_t1",
                 "has_phone",
                 "phone_survey_t0", "phone_survey_t1")
comparisons.labels <- c("Starting Universe",
                        "t0 Online Respondents", "t1 Online Respondents",
                        "Voters with Phones",
                        "t0 Phone Respondents", "t1 Phone Respondents")
weight.comparison <- c(NA, "pweight.online.t0", "pweight.online.t1", NA, NA, NA)
colors <- c("black", "dodgerblue2", "dodgerblue4",
            "firebrick1", "firebrick3", "firebrick4")
demographics <- create.demo.graph(data = t1.t0.phone_mail.data, vars = vars, vars.labels = vars.labels, 
                                  comparisons = comparisons, comparisons.labels = comparisons.labels,
                                  colors = colors, weight.comparison = weight.comparison)
#Figure 8: Average Administrative Data Values for Sampling Frame and Respondents
ggsave("output/demographicrepresentativeness.tiff", 
       demographics, width = 8, height = 4)

#Make the Knowledge Graphs
anes.data.16 <- read.dta('data/anes_pilot_2016_small.dta')
anes.data.16$anes.16.weight <- anes.data.16$weight
anes.data.16$anes.2016.online <- 1
anes.data.12.web <- read.dta('data/anes_pilot_2012_small_web.dta')
anes.data.12.web$anes.2012.online <- 1
anes.data.12.web$anes.12.weight_web <- anes.data.12.web$weight_web
anes.data.12.ftf <- read.dta('data/anes_pilot_2012_small_ftf.dta')
anes.data.12.ftf$anes.2012.ftf <- 1
anes.data.12.ftf$anes.12.weight_ftf <- anes.data.12.ftf$weight_ftf
t1.t0.phone_mail.data$college_degree <- t1.t0.phone_mail.data$t1_college_deg


pk.data.frames <- list(anes.data.16, anes.data.12.web, anes.data.12.ftf, t1.t0.phone_mail.data)
pk.data <- rbindlist(pk.data.frames, fill = TRUE)
pk.data <- data.frame(pk.data)

vars <- c("pk_prestimes_correct", "pk_deficit_correct", "pk_senterm_correct", "pk_spend_correct", "college_degree", "democrat", "republican", "vf_g2008", "vf_g2012")
vars.labels <- c("% Pres Terms Correct", "% Size Deficit Correct", "% Sen Terms Correct", "% Least Spend Correct", "% College Degree", "% Dem. PID", "% Rep. PID", "2008 Turnout", "2012 Turnout")
comparisons <- c("anes.2016.online", "anes.2012.online", "anes.2012.ftf", "online_survey_t0", "online_survey_t1", "phone_survey_t0", "phone_survey_t1")
comparisons.labels <- c("2016 ANES Online", "2012 ANES Online", "2012 ANES FTF", "t0 Online Respondents", "t1 Online Respondents", "t0 Phone Respondents", "t1 Phone Respondents")
weight.comparison <- c("anes.16.weight", "anes.12.weight_web", "anes.12.weight_ftf", "pweight.online.t0", "pweight.online.t1", NA, NA)
colors <- c("gray26", "gray39", "gray48",
            "dodgerblue2", "dodgerblue4",
            "firebrick1", "firebrick3")

pkrepresentativeness <- create.demo.graph(data = pk.data, vars = vars, vars.labels = vars.labels, 
                  comparisons = comparisons, comparisons.labels = comparisons.labels,
                  colors = colors, weight.comparison = weight.comparison)
#Figure 9: Covariates Collected in 2012 and 2016 ANES, Telephone, and Online Surveys
ggsave("output/pkrepresentativeness.tiff", 
       pkrepresentativeness, width = 7.5, height = 3, scale = 1.2)

##########################################################################
#Table OA3: Comparing Mail-to-Online Respondents to 2016 ANES Pilot Study#
##########################################################################
pk.data$strong_dem <- ifelse(pk.data$pid_t0_t1 == 3 | pk.data$strong_dem == 1, 1, 0)
  pk.data$strong_dem[is.na(pk.data$strong_dem)] <- 0
pk.data$weak_dem <- ifelse(pk.data$pid_t0_t1 == 2 | pk.data$weak_dem == 1, 1, 0)
  pk.data$weak_dem[is.na(pk.data$weak_dem)] <- 0
pk.data$lean_dem <- ifelse(pk.data$pid_t0_t1 == 1 | pk.data$lean_dem == 1, 1, 0)
  pk.data$lean_dem[is.na(pk.data$lean_dem)] <- 0
pk.data$independent <- ifelse(pk.data$pid_t0_t1 == 0 | pk.data$independent == 1, 1, 0)
  pk.data$independent[is.na(pk.data$independent)] <- 0
pk.data$lean_rep <- ifelse(pk.data$pid_t0_t1 == -1 | pk.data$lean_rep == 1, 1, 0)
  pk.data$lean_rep[is.na(pk.data$lean_rep)] <- 0
pk.data$weak_rep <- ifelse(pk.data$pid_t0_t1 == -2 | pk.data$weak_rep == 1, 1, 0)
  pk.data$weak_rep[is.na(pk.data$weak_rep)] <- 0
pk.data$strong_rep <- ifelse(pk.data$pid_t0_t1 == -3 | pk.data$strong_rep == 1, 1, 0)
  pk.data$strong_rep[is.na(pk.data$strong_rep)] <- 0
vars <- c("pk_deficit_correct", "pk_spend_correct", 
          "strong_dem", "weak_dem", "lean_dem",
          "independent",
          "lean_rep", "weak_rep", "strong_rep",
          "college_degree")

#2016 ANES Online Survey (unweighted)
for(i in 1:length(vars)) {
  print(round(mean(pk.data[pk.data$anes.2016.online ==1,vars[i]], na.rm = TRUE), 4))
  print(round(se(pk.data[pk.data$anes.2016.online ==1,vars[i]]), 4))
}

#2016 ANES Online Survey (weighted)
weight.se <- function(x, wt) sqrt(wt.var(x, wt)/sum(!is.na(x)))
for(i in 1:length(vars)) {
  print(round(weighted.mean(pk.data[pk.data$anes.2016.online == 1,vars[i]], pk.data[pk.data$anes.2016.online == 1,"anes.16.weight"], na.rm = TRUE), 4))
  print(round(weight.se(pk.data[pk.data$anes.2016.online == 1,vars[i]], pk.data[pk.data$anes.2016.online == 1,"anes.16.weight"]), 4))
}

#Online Respondents
for(i in 1:length(vars)) {
  print(round(mean(pk.data[pk.data$online_survey_t0 ==1,vars[i]], na.rm = TRUE), 4))
  print(round(se(pk.data[pk.data$online_survey_t0 ==1,vars[i]]), 4))
}

###Calculate N
sum(!is.na(pk.data$anes.2016.online)) #ANES Online, unweighted and weighted
sum(!is.na(t1.t0.phone_mail.data$online_survey_t0)) #Online Respondents




#Design Effects of Online and Phone Surveys and Panels


#ebalance reweights the covariate distributions from a control group to match target moments that are computed from a treatment group
#Replace age with mean for missing values
t1.t0.phone_mail.data$voterbase_age[is.na(t1.t0.phone_mail.data$voterbase_age)] <- mean(t1.t0.phone_mail.data$voterbase_age, na.rm=TRUE)

# These are the variables we will use for calculating weights
weighting.vars <- c("vf_female", "vf_white", "vf_afam", "vf_hispanic", "vf_asian",
                    "vf_dem", "vf_rep", "vf_g2014", "vf_g2012", "vf_g2010", "vf_g2008", "voterbase_age")


get.weighting.dataset.eb <- function(data, responded, weighting.vars, design.weights = NULL){
  # Create "treatment group", which is the starting universe to reweight back to.
  Treatment <- data[,weighting.vars]
  Treatment <- getsquares(Treatment) #Generate Matrix of Squared Terms; creates age^2
  Treatment$match <- 1
  Treatment$design.weights <- 1
  
  # Subset to responded set
  df <- data[responded, weighting.vars]
  df <- getsquares(df)
  
  # Merge in weighting variables
  if(is.null(design.weights)) {
    df$design.weights <- rep(1, nrow(df))
  } else {
    df$design.weights <- design.weights[responded]
  }
  
  df$match <- 0
  
  df <- rbind.data.frame(Treatment, df)
  
  return(df)
}

get.eb.survey.weights <- function(data, # Data frame with all data
                                  responded, # A boolean vector indicating which rows are the survey sample
                                  weighting.vars, # A vector of variable names to weight
                                  design.weights = NULL # Design weights, if applicable
) {
  df <- get.weighting.dataset.eb(data, responded, weighting.vars, design.weights)
  
  # Conduct eb
  X <- as.matrix(df[,setdiff(names(df), c('match', 'design.weights'))])
  eb.out <- ebalance(Treatment = df$match,
                     X = X,
                     constraint.tolerance = 0.01)
  return(eb.out$w)
}

get.weighting.dataset.logit <- function(data, responded,
                                        weighting.vars, design.weights = NULL){
  df <- t1.t0.phone_mail.data[,weighting.vars]
  df <- getsquares(df) # creates age^2
  
  df$responded <- as.numeric(responded)
  
  # Merge in weighting variables
  if(is.null(design.weights)) {
    df$design.weights <- rep(1, nrow(df))
  } else {
    df$design.weights <- design.weights
    df$design.weights[is.na(df$design.weights)] <- 1
  }
  
  return(df)
}

get.logit.survey.weights <- function(
  data, # Sampling frame and responses.
  responded, # Boolean vector indicating which individuals responded.
  weighting.vars, # List of variables to use for weighting.
  design.weights # A vector of design weights, 1 / prob of being sampled for follow-up.
){
  df <- get.weighting.dataset.logit(data, responded,
                                    weighting.vars, design.weights)
  x <- as.matrix(df[,setdiff(names(df), c('match', 'design.weights', 'responded'))])
  
  glm.out <- glm(df$responded ~ x,
                 weights = df$design.weights,
                 family = quasibinomial(link = 'logit'))
  #print(summary(glm.out))
  df$predicted.prob.response <- predict(glm.out, type = 'response')
  pweights <- 1 / df$predicted.prob.response[df$responded==1]
  return(pweights)
}



get.survey.weights <- function(use.eb, data, responded, weighting.vars, design.weights = NULL){
  if(use.eb) survey.weights <- get.eb.survey.weights(data, responded, weighting.vars, design.weights)
  else survey.weights <- get.logit.survey.weights(data, responded, weighting.vars, design.weights)
  return(survey.weights / mean(survey.weights, na.rm = TRUE))
}


effective.sample.size <- function(sampling.weights) sum(sampling.weights)^2 / sum(sampling.weights^2)
design.effect <- function(sampling.weights) length(sampling.weights) / effective.sample.size(sampling.weights)
# .weighted when accounting for the design weights we had going in; see Footnote 9
design.effect.weighted <- function(sampling.weights, design.weights, responded) design.effect(sampling.weights * design.weights[responded]) / design.effect(design.weights[responded])

design.effect.wrapper <- function(use.eb = FALSE, data, responded, weighting.vars) round(design.effect(get.survey.weights(use.eb, data, responded, weighting.vars)), 2)
design.effect.wrapper.weighted <- function(use.eb = FALSE, data, responded, weighting.vars, design.weights.var.name){
  design.weights <- data[, design.weights.var.name]
  design.weights <- design.weights / mean(design.weights, na.rm = TRUE)
  survey.weights <- get.survey.weights(use.eb, data, responded, weighting.vars, design.weights)
  return(round(design.effect.weighted(survey.weights, design.weights, responded), 2))
}

get.all.design.effects <- function(use.eb){
  return(list(
    online.t0 = design.effect.wrapper.weighted(use.eb, t1.t0.phone_mail.data, t1.t0.phone_mail.data$online_survey_t0 == 1, weighting.vars, 'pweight.online.t0'),
    online.t1 = design.effect.wrapper.weighted(use.eb, t1.t0.phone_mail.data, t1.t0.phone_mail.data$online_survey_t1 == 1, weighting.vars, 'pweight.online.t1'),
    phone.t0 = design.effect.wrapper(use.eb, t1.t0.phone_mail.data, t1.t0.phone_mail.data$phone_survey_t0 == 1, weighting.vars),
    phone.t1 = design.effect.wrapper(use.eb, t1.t0.phone_mail.data, t1.t0.phone_mail.data$phone_survey_t1 == 1, weighting.vars)
  ))
}

t1.t0.phone_mail.data$online_survey_t0[is.na(t1.t0.phone_mail.data$online_survey_t0)] <- 0
t1.t0.phone_mail.data$online_survey_t1[is.na(t1.t0.phone_mail.data$online_survey_t1)] <- 0
t1.t0.phone_mail.data$phone_survey_t0[is.na(t1.t0.phone_mail.data$phone_survey_t0)] <- 0
t1.t0.phone_mail.data$phone_survey_t1[is.na(t1.t0.phone_mail.data$phone_survey_t1)] <- 0

eb.design.effects <- get.all.design.effects(TRUE)
logit.design.effects <- get.all.design.effects(FALSE)

#This creates the output required for Table 5: Design Effects of Online and Phone Surveys and Panels.
logit.design.effects
eb.design.effects


